IMPORT

1. Librairies

library(phyloseq) # for phyloseq object
library(ggplot2)
library(ggsignif)
library(cowplot)
library(tidyverse)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap

2. Data

# Set path
path <- "~/Projects/IBS_Meta-analysis_16S"
path.plots <- file.path(path, "data/analysis-individual/PLOTS/plots-LoPresti-EDA")

# Import phyloseq object
physeq.lopresti <- readRDS(file.path(path, "phyloseq-objects/physeq_lopresti.rds"))

# Sanity check
physeq.lopresti
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 806 taxa and 57 samples ]
## sample_data() Sample Data:       [ 57 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 806 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 806 tips and 801 internal nodes ]
## refseq()      DNAStringSet:      [ 806 reference sequences ]

Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.

# Look at the tree
plot_tree(physeq.lopresti, color = "Phylum", ladderize="left")

DEMOGRAPHICS

This dataset has several covariates (gender, age, bmi, sample_storage_duration). We will check whether there is the same distribution of these covariates between healthy and IBS patients.

# Number of individuals in each group (keeping just 1 sample per individual)
metadata <- data.frame(sample_data(physeq.lopresti)) %>%
  select(host_disease, host_age, host_sex, host_ID) %>%
  group_by(host_ID) %>%
  summarise_all(first)
  
metadata %>%
  count(host_disease)
# Age
metadata %>%
  group_by(host_disease) %>%
  summarize(mean_age=mean(host_age), sd_age=sd(host_age))
wilcox.test(metadata[metadata$host_disease == "IBS", ]$host_age,
            metadata[metadata$host_disease == "Healthy", ]$host_age) # p=0.25
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata[metadata$host_disease == "IBS", ]$host_age and metadata[metadata$host_disease == "Healthy", ]$host_age
## W = 281, p-value = 0.2536
## alternative hypothesis: true location shift is not equal to 0
# Gender
metadata %>%
  count(host_disease, host_sex)
chisq.test(data.frame("Female" = c(12,17),
                      "Male" = c(18,6))) # p=0.02 -> more females than males in IBS group
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  data.frame(Female = c(12, 17), Male = c(18, 6))
## X-squared = 4.7518, df = 1, p-value = 0.02927

ABUNDANCES

1. Absolute abundances

# Plot Phylum
plot_bar(physeq.lopresti, fill = "Phylum") + facet_wrap("sample_type", scales="free_x") +
  theme(axis.text.x = element_text(size = 5))+
  labs(x = "Samples", y = "Absolute abundance", title = "Lo Presti dataset (2019)")+
  theme_cowplot()+
  theme(axis.text.x = element_text(size=6, angle=90))

# ggsave(file.path(path.plots, "absAbundance_phylum.jpg"), width=13, height=5)

# Plot Class
plot_bar(physeq.lopresti, fill = "Class")+ facet_wrap("sample_type", scales="free_x") +
  theme(axis.text.x = element_text(size = 5))+
  labs(x = "Samples", y = "Absolute abundance", title = "Lo Presti dataset (2019)")+
  theme_cowplot()+
  theme(axis.text.x = element_text(size=6, angle=90))

Sequencing depth characteristics of the Hugerth dataset:
- minimum of 504 total count per sample
- median: 897 total count per sample
- maximum of 3838 total count per sample

2. Relative abundances

# Agglomerate to phylum & class levels
phylum.table <- physeq.lopresti %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

class.table <- physeq.lopresti %>%
  tax_glom(taxrank = "Class") %>%
  transform_sample_counts(function(x) {x/sum(x)} ) %>%
  psmelt()


# Plot relative abundances
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ sample_type + host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance", title = "Lo Presti dataset (2019)")

# ggsave(file.path(path.plots, "relAbundance_phylum.jpg"), width=10, height=5)

ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(class.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                        y = Abundance, fill = Class))+
  facet_wrap(~ sample_type + host_disease, scales = "free") +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank(),
        legend.key.size = unit(0.2, 'cm'),
        legend.text = element_text(size=8))+
  labs(x = "Samples", y = "Relative abundance", title = "Lo Presti dataset (2019)")

# ggsave(file.path(path.plots, "relAbundance_class.jpg"), width=12, height=5)

3. Firmicutes/Bacteroidota ratio

# Extract abundance of only Bacteroidota and Firmicutes
relevant.covariates <- c('Sample', 'Abundance', 'host_disease', 'host_subtype', 'Phylum', 'sample_type', 'host_ID', 'host_age', 'host_sex')

bacter <- phylum.table %>%
  filter(Phylum == "Bacteroidota") %>%
  select(all_of(relevant.covariates)) %>%
  dplyr::rename(Bacteroidota = Abundance) %>%
  select(-Phylum)

firmi <- phylum.table %>%
  filter(Phylum == "Firmicutes") %>%
  select(all_of(relevant.covariates)) %>%
  dplyr::rename(Firmicutes = Abundance) %>%
  select(-Phylum)

# Calculate log2 ratio Firmicutes/Bacteroidota
ratio.FB <- left_join(x=bacter, y=firmi, by=c('Sample', 'host_disease', 'host_subtype', 'sample_type', 'host_ID', 'host_age', 'host_sex')) %>%
  relocate(Firmicutes, .after=Bacteroidota) %>%
  # Compute log ratios
  mutate(logRatioFB = log2(Firmicutes/Bacteroidota))


# Plot log2 ratio Firmicutes/Bacteroidota
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_jitter(width=0.1)+
  facet_wrap(~sample_type) +
  geom_signif(comparisons = list(c("Healthy", "IBS")), map_signif_level = TRUE, test="wilcox.test") +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Firmicutes:Bacteroidota ratio")+
  theme_cowplot()+
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB.jpg"), width=5, height=5)

# Statistical test sigmoid samples
wilcox.test(ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy","logRatioFB"]) # p = 0.5
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 18, p-value = 0.5074
## alternative hypothesis: true location shift is not equal to 0
# Statistical test stool samples
wilcox.test(ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy","logRatioFB"]) # p=0.01 significant difference in stool samples
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 143.5, p-value = 0.01208
## alternative hypothesis: true location shift is not equal to 0
# Plot log2 ratio Firmicutes/Bacteroidota per IBS subtype
ggplot(ratio.FB %>% filter(sample_type == "stool"), aes(x = host_subtype, y = logRatioFB))+
  geom_violin(aes(fill=host_subtype))+
  scale_fill_manual(values=scales::alpha(c("blue", "brown", "red", "orange"), .3))+
  geom_jitter(width=0.1)+
  geom_signif(comparisons = list(c("HC", "IBS-C"), c("HC", "IBS-D"), c("HC", "IBS-M")),
              map_signif_level = TRUE, test="wilcox.test", step_increase=0.1) +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Fecal samples")+
  theme_cowplot()+
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB_subtype.jpg"), width=5, height=5)


# Paired data
ggplot(ratio.FB, aes(x = sample_type, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_point()+
  geom_line(aes(group=host_ID), lwd=0.1) +
  facet_wrap(~host_disease) +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Paired data") +
  theme_cowplot()+
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB_paired-data.jpg"), width=5, height=5)

# Statistical test sigmoid vs stool samples
wilcox.test(ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy","logRatioFB"]) # p=0.001
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 172, p-value = 0.001039
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS","logRatioFB"]) # p=0.04
## 
##  Wilcoxon rank sum exact test
## 
## data:  ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS", "logRatioFB"]
## W = 63, p-value = 0.04382
## alternative hypothesis: true location shift is not equal to 0

NORMALIZE DATA

# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.lopresti)<500) # all FALSE

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.lopresti
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts

# Sanity check that 0 values have been replaced
# otu_table(physeq.lopresti)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]

# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1

# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/LoPresti-2019/02_EDA-LoPresti/physeq_NZcomp.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.lopresti
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) )

# check the counts are all relative
# otu_table(physeq.lopresti)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]

# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1

# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/LoPresti-2019/02_EDA-LoPresti/physeq_relative.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.lopresti
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )

# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total

# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/LoPresti-2019/02_EDA-LoPresti/physeq_CSN.rds"))


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.lopresti
physeq.clr <- microbiome::transform(physeq.lopresti, "clr") # the function adds pseudocounts itself

# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
# otu_table(physeq.lopresti)[1:5, 1:5] # should contain absolute counts
# otu_table(physeq.clr)[1:5, 1:5] # should all be relative

# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/LoPresti-2019/02_EDA-LoPresti/physeq_clr.rds"))

COMPUTE DISTANCES

1. UniFrac, Aitchison, Bray-Curtis and Canberra

First, let’s look at these four distances of interest.

#____________________________________________________________________________________
# Measure distances
getDistances <- function(relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
  # sanity check
  cat("nb samples relPhyseq:", nsamples(relPhyseq), "\n")
  cat("nb samples clrPhyseq:", nsamples(clrPhyseq), "\n")
  cat("nb samples csnPhyseq:", nsamples(csnPhyseq), "\n")
  cat("nb samples nzcompPhyseq:", nsamples(nzcompPhyseq), "\n")
  
  # Compute distances
  print("Unifrac...")
  set.seed(123) # for unifrac, need to set a seed
  glom.UniF <- UniFrac(relPhyseq, weighted=TRUE, normalized=TRUE) # weighted unifrac
  print("Aitchison...")
  glom.ait <- phyloseq::distance(clrPhyseq, method = 'euclidean') # aitchison
  print("Bray des bois...")
  glom.bray <- phyloseq::distance(csnPhyseq, method = "bray") # bray-curtis
  print("Canberra <3...")
  glom.can <- phyloseq::distance(nzcompPhyseq, method = "canberra") # canberra
  
  dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray)
  
  return(dist.list)
}


#____________________________________________________________________________________
# Plot in 2D the distances
plotDistances2D <- function(dlist, ordination="MDS", relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
  plist <- NULL
  plist <- vector("list", 4)
  names(plist) <- c("Weighted Unifrac", "Aitchison", "Bray-Curtis", "Canberra")
  
  print("Unifrac")
  # Weighted UniFrac
  set.seed(123)
  iMDS.UniF <- ordinate(relPhyseq, ordination, distance=dlist$UniF)
  plist[[1]] <- plot_ordination(relPhyseq, iMDS.UniF, color="host_disease")
  
  print("Aitchison")
  # Aitchison
  set.seed(123)
  iMDS.Ait <- ordinate(clrPhyseq, ordination, distance=dlist$Ait)
  plist[[2]] <- plot_ordination(clrPhyseq, iMDS.Ait, color="host_disease")
  
  print("Bray")
  # Bray-Curtis
  set.seed(123)
  iMDS.Bray <- ordinate(csnPhyseq, ordination, distance=dlist$Bray)
  plist[[3]] <- plot_ordination(csnPhyseq, iMDS.Bray, color="host_disease")
  
  print("Canberra")
  # Canberra
  set.seed(123)
  iMDS.Can <- ordinate(nzcompPhyseq, ordination, distance=dlist$Can)
  plist[[4]] <- plot_ordination(nzcompPhyseq, iMDS.Can, color="host_disease")
  
  # Creating a dataframe to plot everything
  plot.df = plyr::ldply(plist, function(x) x$data)
  names(plot.df)[1] <- "distance"
  
  return(plot.df)
}

Now let’s plot!

a) Fecal samples

#________________
# FECAL DATA
#________________

# Get the distances & the plot data
dist.lopresti.fecal <- getDistances(relPhyseq = subset_samples(physeq.rel, sample_type=="stool"),
                                    clrPhyseq = subset_samples(physeq.clr, sample_type=="stool"),
                                    csnPhyseq = subset_samples(physeq.CSN, sample_type=="stool"),
                                    nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="stool"))
## nb samples relPhyseq: 46 
## nb samples clrPhyseq: 46 
## nb samples csnPhyseq: 46 
## nb samples nzcompPhyseq: 46 
## [1] "Unifrac..."
## Warning in UniFrac(relPhyseq, weighted = TRUE, normalized = TRUE): Randomly
## assigning root as -- ASV415 -- in the phylogenetic tree in the data you
## provided.
## Warning in matrix(tree$edge[order(tree$edge[, 1]), ][, 2], byrow = TRUE, : data
## length [1609] is not a sub-multiple or multiple of the number of rows [805]
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
# Error message: "data length [1609] is not a sub-multiple or multiple of the number of rows [805]"
# Let's check if some nodes have more than 2 children, which could explain why there is an odd number of rows in the edge matrix
edges <- phy_tree(physeq.rel)$edge
mycounts <- table(edges[,1])
length(mycounts[mycounts==2])
## [1] 801
length(mycounts[mycounts!=2])
## [1] 2
mycounts[mycounts!=2] # There are 2 nodes with 3 children
## 
##  807 1600 
##    3    3
# Let's get a dichotomy tree to avoid this error
phy_tree(physeq.rel) <- ape::multi2di(phy_tree(physeq.rel))


# Re-run the distances
dist.lopresti.fecal <- getDistances(relPhyseq = subset_samples(physeq.rel, sample_type=="stool"),
                                    clrPhyseq = subset_samples(physeq.clr, sample_type=="stool"),
                                    csnPhyseq = subset_samples(physeq.CSN, sample_type=="stool"),
                                    nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="stool"))
## nb samples relPhyseq: 46 
## nb samples clrPhyseq: 46 
## nb samples csnPhyseq: 46 
## nb samples nzcompPhyseq: 46 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df.fecal <- plotDistances2D(dlist=dist.lopresti.fecal,
                                 relPhyseq = subset_samples(physeq.rel, sample_type=="stool"),
                                 clrPhyseq = subset_samples(physeq.clr, sample_type=="stool"),
                                 csnPhyseq = subset_samples(physeq.CSN, sample_type=="stool"),
                                 nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="stool"))
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
ggplot(plot.df.fecal, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20))+
  labs(color="Disease", title="Fecal samples")

# ggsave(file.path(path.plots, "distances4_MDS_stool.jpg"), height = 4, width = 15)

b) Sigmoid samples

#________________
# SIGMOID DATA
#________________

# Get the distances & the plot data
dist.lopresti.sigmoid <- getDistances(relPhyseq = subset_samples(physeq.rel, sample_type=="sigmoid"),
                                    clrPhyseq = subset_samples(physeq.clr, sample_type=="sigmoid"),
                                    csnPhyseq = subset_samples(physeq.CSN, sample_type=="sigmoid"),
                                    nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="sigmoid"))
## nb samples relPhyseq: 11 
## nb samples clrPhyseq: 11 
## nb samples csnPhyseq: 11 
## nb samples nzcompPhyseq: 11 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df.sigmoid <- plotDistances2D(dlist=dist.lopresti.sigmoid,
                                   relPhyseq = subset_samples(physeq.rel, sample_type=="sigmoid"),
                                   clrPhyseq = subset_samples(physeq.clr, sample_type=="sigmoid"),
                                   csnPhyseq = subset_samples(physeq.CSN, sample_type=="sigmoid"),
                                   nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="sigmoid"))
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
ggplot(plot.df.sigmoid, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20))+
  labs(color="Disease", title="Sigmoid samples")

# ggsave(file.path(path.plots, "distances4_MDS_sigmoid.jpg"), height = 4, width = 15)

c) All samples

#________________
# ALL DATA
#________________

# Get the distances & the plot data
dist.lopresti <- getDistances()
## nb samples relPhyseq: 57 
## nb samples clrPhyseq: 57 
## nb samples csnPhyseq: 57 
## nb samples nzcompPhyseq: 57 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df <- plotDistances2D(dlist=dist.lopresti)
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
p1 <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20),
        axis.title.x = element_blank())+
  labs(color="Disease")
p2 <- ggplot(plot.df, aes(Axis.1, Axis.2, color=sample_type))+
  geom_line(aes(group=host_ID), color="black", lwd=0.1)+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('#33CC00', 'purple'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_blank())+
  labs(color="Sample type")
ggpubr::ggarrange(p1,p2, nrow=2)

# ggsave(file.path(path.plots, "distances4_MDS_all.jpg"), height = 8, width = 15)

2. Plot in 3D

For better visualization, we will also take a glance at reduction to 3D.

#____________________________________________________________________________________
# Plot 3D ordination
plotDistances3D <- function(d, name_dist, physeq=physeq.lopresti){
  
  # Reset parameters
  mds.3D <- NULL
  xyz <- NULL
  fig.3D <- NULL
  
  # Reduce distance matrix to 3 dimensions
  set.seed(123)
  mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
  xyz <- scores(mds.3D, display="sites") # pull out the (x,y,z) coordinates
  
  # Plot
  fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d", mode="markers",
                    color=sample_data(physeq)$host_disease, colors = c("blue", "red"))%>%
    layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
  
  return(fig.3D)
}

Now let’s plot!

# Fecal samples
plotDistances3D(dist.lopresti.fecal$UniF, "UniFrac", subset_samples(physeq.lopresti, sample_type=="stool"))
plotDistances3D(dist.lopresti.fecal$Ait, "Aitchison", subset_samples(physeq.lopresti, sample_type=="stool"))
plotDistances3D(dist.lopresti.fecal$Canb, "Canberra", subset_samples(physeq.lopresti, sample_type=="stool"))
plotDistances3D(dist.lopresti.fecal$Bray, "Bray-Curtis", subset_samples(physeq.lopresti, sample_type=="stool"))
# Note: for Bray-Curtis, all the samples are accumulated next to each other at coordinates (0,0,0)

HIERARCHICAL CLUSTERING

# For heatmaps: have group color
matcol <- data.frame(phenotype = sample_data(physeq.lopresti)[,"host_disease"],
                     host_subtype = sample_data(physeq.lopresti)[,"host_subtype"],
                     sample = sample_data(physeq.lopresti)[,"sample_type"])


# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
  
  # Initialize variables
  i=1
  plist <- vector("list", 4)
  names(plist) <- names(dlist)
  
  # Loop through distances
  for(d in dlist){
    plist[[i]] <- pheatmap(as.matrix(d), 
                          clustering_distance_rows = d,
                          clustering_distance_cols = d,
                          fontsize = fontsize,
                          show_rownames = F,
                          show_colnames = F,
                          annotation_col = matcol,
                          # annotation_row = matcol,
                          annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   sample_type = c("stool" = "purple", 'sigmoid' = '#33CC00'),
                                                   host_subtype = c('HC'='black', 'IBS-M'='orange', 'IBS-C'='brown', 'IBS-D'='red')),
                          cluster_rows = T,
                          cluster_cols = T,
                          clustering_method = 'complete', # hc method
                          main = names(dlist)[i]) # have name of distance as title
    i <- i+1
  }
  
  return(plist)
}


# Get the heatmaps
heatmp.lopresti <- plotHeatmaps(dlist = dist.lopresti, fontsize = 8)

REPRODUCE PLOTS FROM PAPER

Figure 1B and 3B - Bar chart phyla relative abundances

# Figure 1B and 3B
phylum.table$host_disease <- factor(phylum.table$host_disease, levels = c('IBS', 'Healthy')) # have same order as paper
phylum.table$sample_type <- factor(phylum.table$sample_type, levels = c('stool', 'sigmoid'))

ggplot(phylum.table, aes(x = host_disease, y = Abundance, fill = Phylum))+
  geom_bar(stat = "identity", position = "fill") +
  facet_wrap(~sample_type)+
  theme_cowplot()+
  theme(axis.text.x = element_text(size = 10),
        axis.title.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance", title = "Stool samples")


For the fecal samples, we get a very similar plot as in the original paper (decrease Firmicutes in IBS, increase Bacteroidota, increase Verrucomicrobiota). For the sigmoid samples, comparison with the original paper is limited, as we discarded most of the samples (only 11 sigmoid mucosa samples left).

Figure 2B and 4B - Boxplot of selected genera relative abundance

# Get relative abundance of families
genus.table <- physeq.lopresti %>%
  tax_glom(taxrank = "Genus") %>%
  transform_sample_counts(function(x) {x/sum(x)} ) %>%
  psmelt()

# Check at genera present
table(genus.table$Genus)
## 
##               Acidaminococcus                  Agathobacter 
##                            57                            57 
##                   Akkermansia                     Alistipes 
##                            57                            57 
##                Alloprevotella                  Anaerostipes 
##                            57                            57 
##                 Anaerotruncus                   Bacteroides 
##                            57                            57 
##                   Barnesiella                       Blautia 
##                            57                            57 
##                Butyricicoccus                 Butyricimonas 
##                            57                            57 
##                       CAG-873               Catenibacterium 
##                            57                            57 
##              Cellulosilyticum Christensenellaceae_R-7_group 
##                            57                            57 
##                   Citrobacter   Clostridium_sensu_stricto_1 
##                            57                            57 
##                   Coprobacter                   Coprococcus 
##                            57                            57 
##                     Dialister                        Dielma 
##                            57                            57 
##                         Dorea                        DTU089 
##                            57                            57 
##                Eisenbergiella        Erysipelatoclostridium 
##                            57                            57 
##   Erysipelotrichaceae_UCG-003          Escherichia/Shigella 
##                            57                            57 
##                   Eubacterium              Faecalibacterium 
##                            57                            57 
##                  Faecalitalea      Family_XIII_AD3011_group 
##                            57                            57 
##                Flavobacterium                Flavonifractor 
##                            57                            57 
##              Fusicatenibacter                 Fusobacterium 
##                            57                            57 
##                     Georgenia                Granulicatella 
##                            57                            57 
##                   Haemophilus        Hafnia-Obesumbacterium 
##                            57                            57 
##                  Holdemanella                    Holdemania 
##                            57                            57 
##                Incertae_Sedis             Lachnoclostridium 
##                            57                            57 
##                   Lachnospira Lachnospiraceae_NK4A136_group 
##                            57                            57 
##       Lachnospiraceae_UCG-001                 Lactobacillus 
##                            57                            57 
##                    Monoglobus              Negativibacillus 
##                            57                            57 
##                 NK4A214_group                   Odoribacter 
##                            57                            57 
##                 Oscillibacter                  Oscillospira 
##                            57                            57 
##               Parabacteroides                Parasutterella 
##                            57                            57 
##              Peptoclostridium         Phascolarctobacterium 
##                            57                            57 
##                    Prevotella   Prevotellaceae_NK3B31_group 
##                            57                            57 
##        Prevotellaceae_UCG-003                       Proteus 
##                            57                            57 
##          Pseudoflavonifractor                   Pseudomonas 
##                            57                            57 
##                Pyramidobacter   Rikenellaceae_RC9_gut_group 
##                            57                            57 
##                 Robinsoniella                    Romboutsia 
##                            57                            57 
##                     Roseburia                  Ruminococcus 
##                            57                            57 
##                Shuttleworthia                       Slackia 
##                            57                            57 
##                 Streptococcus               Subdoligranulum 
##                            57                            57 
##              Succiniclasticum                 Succinivibrio 
##                            57                            57 
##                    Sutterella                  Turicibacter 
##                            57                            57 
##                    Tuzzerella                       UBA1819 
##                            57                            57 
##                       UCG-002                       UCG-003 
##                            57                            57 
##                       UCG-005 
##                            57
ggplot(genus.table %>% filter(Genus %in% c("Bacteroides", "Parabacteroides", "Pseudomonas", "Prevotella", "Anaerostipes", "Eubacterium", "Haemophilus")),
       aes(x=host_disease, y=Abundance, fill=Genus))+
  geom_boxplot(outlier.shape = NA, position=position_dodge(0.75))+
  geom_point(position=position_dodge(0.75), aes(color=Genus))+
  theme_cowplot()+
  labs(x="", y="Relative abundance")